import pandas as pd
import numpy as np
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# from IPython.display import Image
from sklearn import tree
from os import system
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score, classification_report
data = pd.read_csv('bank-full.csv')
a. Univariate analysis – data types and description of the independent attributes, including: name, meaning, range of observed values, central values (mean and median), standard deviation and quartiles, analysis of the body and tails of the distributions, missing values, and outliers.
b. Strategies to address the different data challenges, such as data pollution, outlier treatment and missing-value treatment.
c. Please provide comments in the Jupyter notebook describing the steps you take and the insights drawn from the plots.
# Peek at the first rows to sanity-check the load.
data.head(10)
# Data types and non-null counts per column.
data.info()
# Summary statistics (mean, std, quartiles) of the numeric attributes.
data.describe().transpose()
data.shape
data.nunique()
# Value counts of categorical variables to spot polluted / incorrect levels.
# NOTE(review): 'previous' is a numeric count, not truly categorical — kept here
# only to inspect its discrete values.
cat_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan',
               'contact', 'poutcome', 'Target', 'previous']
for i in cat_columns:
    x = data[i].value_counts()
    print(i)
    print(x)
    print("")
# Univariate profiling report.
# pandas_profiling was renamed to ydata-profiling; support either install.
try:
    from ydata_profiling import ProfileReport
except ImportError:
    from pandas_profiling import ProfileReport
ProfileReport(data)
# seaborn >= 0.12 requires keyword arguments for axis data.
sns.boxplot(x=data['balance'])
# From the data summary, ProfileReport and the boxplot, 'balance' has outliers:
# mean ~1362.27, min -8019.0, max 102127.0.
from scipy.stats import zscore
balance_outliers = zscore(data['balance'])
print(balance_outliers)
sns.boxplot(x=data['Target'], y=data['balance'])
sns.boxplot(x=data['Target'], y=data['age'])
# distplot was deprecated and removed; histplot(..., kde=True) is its replacement.
for i in ['age', 'balance']:
    sns.histplot(data[i], kde=True)
    plt.show()
# Null values: the dataset reports no true NaNs.
data.isnull().sum()
data.isna().values.any()
There is no missing value in this dataset, but there are placeholder categories such as "unknown" and "other" that effectively act as missing data.
From the analysis above we know the data contains no true nulls; the placeholder categories must be handled instead. We now move to the bivariate stage:
a. Bivariate analysis between the predictor variables and the target column. Comment on your findings in terms of their relationship and the degree of relation, if any. Visualize the analysis using box plots and pair plots, histograms or density curves. Select the most appropriate attributes.
b. Please provide comments in the Jupyter notebook describing the steps you take and the insights drawn from the plots.
# Encode the target as 0/1 so it can participate in numeric analysis.
data["Target_Int"] = data["Target"].apply(lambda x: 0 if x == 'no' else 1)
data.head(10)
# Correlation of numeric columns only — pandas >= 2.0 raises on mixed dtypes
# unless numeric_only=True is passed explicitly.
corr = data.corr(numeric_only=True)
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13, 7))
# Mask the upper triangle so each correlation value appears only once.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
a = sns.heatmap(corr, mask=mask, annot=True, fmt='.2f')
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)
# Pairwise scatter plots of the main numeric predictors.
sns.pairplot(data[['age', 'balance', 'duration', 'campaign']]);
# Handle incorrect values: collapse 'other' into 'unknown' for poutcome —
# both represent an unusable previous-campaign outcome.
data[['poutcome']] = data[['poutcome']].replace(['other'], 'unknown')
# Drop columns judged to have no impact, plus the string target now that
# Target_Int carries the same information. One drop call instead of three.
newData = data.drop(columns=['contact', 'marital', 'Target'])
newData.head()
newData.shape
# Drop outliers: rows whose balance z-score is outside +/-3.
# balance_outliers was computed on the full frame, so it is row-aligned with
# newData (only columns were dropped above).
condition1 = np.abs(balance_outliers) > 3
newData = newData.drop(newData[condition1].index, axis=0, inplace=False)
newData.shape
X = newData.drop('Target_Int', axis=1)
Y = newData['Target_Int']
# One-hot encode categoricals, dropping one level each to avoid collinearity.
X = pd.get_dummies(X, drop_first=True)
X.columns
# Create the training set and test set in a ratio of 70:30.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
print("Total records: ", len(newData.index))
print("Training records: ", len(x_train))
print("Testing records: ", len(x_test))
print("{0:0.2f}% data is in training set".format((len(x_train) / len(newData.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_test) / len(newData.index)) * 100))

def _print_class_balance(y):
    """Print positive/negative counts and the positive-class proportion of y."""
    pos = y[y == 1].count()
    neg = y[y == 0].count()
    print("Positive = ", pos)
    print("Negative = ", neg)
    print(pos / len(y))

# Class balance of train and test — the split should preserve the base rate.
_print_class_balance(y_train)
_print_class_balance(y_test)
def draw_cm(actual, predicted):
    """Plot the confusion matrix of actual vs. predicted labels as a heatmap.

    Confusion-matrix cells are integer counts, so annotate with 'd' rather
    than a float format (the previous '.2f' rendered e.g. 4500.00).
    """
    cm = metrics.confusion_matrix(actual, predicted)
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
# Accumulators for the model-comparison summary; PrintMetrics appends one
# entry to each list per evaluated algorithm.
algo_names, train_scores, test_scores = [], [], []
recall_scores, precision_scores = [], []
f1_scores, roc_auc_scores = [], []
def PrintMetrics(name, model, predicted):
    """Print train/test accuracy and test-set metrics for a fitted model.

    Also appends each score to the module-level comparison lists
    (algo_names, train_scores, ...) for a later summary. Relies on the
    module-level x_train/y_train/x_test/y_test split.
    """
    algo_names.append(name)
    train_score = model.score(x_train, y_train)
    train_scores.append(train_score)
    print("Training accuracy :", train_score)
    print()
    test_score = round(model.score(x_test, y_test), 3)
    test_scores.append(test_score)
    print("Testing accuracy:", test_score)
    print()
    print('Confusion Matrix')
    # draw_cm plots and returns None — call it directly instead of
    # print()-ing its None return value.
    draw_cm(y_test, predicted)
    print()
    recall = recall_score(y_test, predicted)
    recall_scores.append(recall)
    print("Recall:", recall)
    print()
    precision = precision_score(y_test, predicted)
    precision_scores.append(precision)
    print("Precision:", precision)
    print()
    f1Value = f1_score(y_test, predicted)
    f1_scores.append(f1Value)
    print("F1 Score:", f1Value)
    print()
    roc = roc_auc_score(y_test, predicted)
    roc_auc_scores.append(roc)
    print("Roc Auc Score:", roc)
    print()
    print("Classification Report:")
    print(classification_report(y_test, predicted))
# Logistic Regression.
# Solvers previously tried by hand: 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'.
# max_iter is raised from the default 100 so lbfgs converges on these
# unscaled features instead of emitting a ConvergenceWarning.
logModel = LogisticRegression(max_iter=1000)
logModel.fit(x_train, y_train)
y_predict = logModel.predict(x_test)
# Inspect the fitted coefficients and intercept.
coef_df = pd.DataFrame(logModel.coef_)
coef_df['intercept'] = logModel.intercept_
print(coef_df)
print('LogisticRegression Metrics:')
print()
PrintMetrics('LogisticRegression', logModel, y_predict)
# Decision Tree algorithm.
# random_state is pinned so the fitted tree (and its metrics) are reproducible.
# NOTE(review): an unpruned entropy tree will overfit — consider max_depth /
# min_samples_leaf as a follow-up.
dTree = DecisionTreeClassifier(criterion='entropy', random_state=1)
dTree.fit(x_train, y_train)
dTreePredict = dTree.predict(x_test)
# Decision Tree metrics.
print('Decision Tree Metrics:')
print()
PrintMetrics('Decision Tree', dTree, dTreePredict)
from sklearn.tree import export_graphviz
# sklearn.externals.six was removed in scikit-learn 0.23; StringIO comes
# from the standard library.
from io import StringIO
from IPython.display import Image
import pydotplus
import graphviz
features = [col for col in X.columns if col != 'Target_Int']
dot_data = StringIO()
# class_names must match the 0/1 encoding of Target_Int ('no' -> 0, 'yes' -> 1);
# the previous ['5', '6', '8'] labels (and the 'wines' filename) were
# copy-paste residue from a wine-quality notebook.
export_graphviz(dTree, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=features,
                class_names=['no', 'yes'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('bank_tree.png')
Image(graph.create_png())